    # _filter variant for the put-block-in-bowl task
# Color words and object categories defined alongside the first `_filter` variant.
COLORS = [
    'blue', 'red', 'green', 'orange',
    'purple', 'pink', 'white', 'yellow',
    'brown', 'gray', 'cyan',
]
OBJ = [
    'bowl',
    'block',
]

@torch.no_grad()
def _filter(
    self, image,
    detections, obj_name,
    location, frame_path=None,
    visualize_failure=False,
    use_color=True):
    """
    Select the candidate boxes that CLIP scores highest for obj_name.

    Every candidate box is cropped from the image and scored by CLIP against
    one "this is a photo of <object>" prompt per vocabulary entry. For each
    requested object the (up to) two boxes with the highest score for that
    object's class are kept.

        image: H, W, 3
        detections: Candidate boxes [N, 4] (class agnostic)
        obj_name: string label of the object(s) of interest. A trailing "s"
            is dropped, and "X and Y <object>" is expanded into two queries,
            e.g. "red and blue blocks" -> "red block", "blue block".
        location: location hint; only the string 'none' is handled
        frame_path: optional frame path. If the fallback path below is taken,
            the part of the file name before the first "_" is appended to
            self.bad_frames (once).
        visualize_failure: if set, self._visualize is called on the fallback
            path with all detections and with the box that was picked
        use_color: use COLOR_OBJECTS as the vocabulary when true, otherwise
            ALL_CLIPORT_OBJECTS

    Returns the selected rows of detections for all requested objects,
    concatenated along dim 0.

    Raises AssertionError if a requested object is not in the vocabulary, or
    if location is anything other than 'none'.
    """
    # get image patches and run them through CLIP's preprocessing
    image_patches = self._crop_img_inside_bbox(image, detections)
    processed_patches = torch.stack([
        self.clip_preprocess(
            Image.fromarray(patch.numpy(), 'RGB')).to(self.device)
        for patch in image_patches
    ])

    vocabulary = COLOR_OBJECTS if use_color else ALL_CLIPORT_OBJECTS
    class_list = [f'this is a photo of {obj}' for obj in vocabulary]
    class_text = clip.tokenize(class_list).to(self.device)

    # run clip! One row per patch, one column per vocabulary entry.
    logits_per_image, _ = self.clip_model(processed_patches, class_text)
    pred_scores = logits_per_image.softmax(dim=-1)

    # make the query singular; endswith() also copes with an empty string
    if obj_name.endswith('s'):
        obj_name = obj_name[:-1]

    obj_names = []
    if ' and ' in obj_name:
        # Split on the same delimiter that was tested above, so that names
        # merely containing "and" (e.g. "android") are not cut apart.
        obj_list = obj_name.split(' and ')
        # everything after the first word of the second part is the shared noun
        base_object = ' '.join(obj_list[1].split()[1:]).strip()
        obj0 = obj_list[0].split(' all ')[-1].strip()
        obj_names.append(obj0 + ' ' + base_object)
        obj_names.append(obj_list[1].strip())
    else:
        obj_names.append(obj_name)

    all_pred_boxes = []
    for oname in obj_names:
        # Raised explicitly (not via `assert`) so the check survives
        # `python -O`; the AssertionError type is kept for existing callers.
        if oname not in vocabulary:
            raise AssertionError(f"not found {oname} in vocabulary")
        gt_class = vocabulary.index(oname)

        # keep the (up to) two boxes with the highest score for this class
        pred_boxes = detections[torch.argsort(pred_scores[:, gt_class])[-2:]]

        # NOTE(review): the top-2 selection above yields at least one box
        # whenever detections is non-empty, so this fallback looks
        # unreachable in practice - confirm before removing it.
        if len(pred_boxes) == 0:
            # failure case: fall back to the single most confident box
            print(f"No confident box found for {oname}")
            pred_boxes = detections[pred_scores[:, gt_class].argmax()].unsqueeze(0)

            if visualize_failure:
                self._visualize(image, detections=detections, concept='scene', caption='all_detections')
                self._visualize(image, pred_boxes, concept='filter', caption=f'filter_{oname}')

            if frame_path is not None:
                frame_num = frame_path.split('/')[-1].split('_')[0]
                if frame_num not in self.bad_frames:
                    self.bad_frames.append(frame_num)
        all_pred_boxes.append(pred_boxes)

    all_pred_boxes = torch.cat(all_pred_boxes, dim=0)

    if location != 'none':
        # location hints are not handled; fail loudly even under `python -O`
        raise AssertionError("not implemented yet")
    return all_pred_boxes

# Color words and object categories defined alongside the packing `_filter` variant.
COLORS = [
    'blue', 'red', 'green', 'orange',
    'purple', 'pink', 'white', 'yellow',
    'brown', 'gray', 'cyan',
]
OBJ = ['block']  # alternative: ['bowl', 'block', 'ring']

# Objects that are named without a color word.
COLOR_AGNOSTIC_OBJECTS = [
    'rope',
    '3-sided frame',
]

# CLIP vocabulary used by `_filter` when use_color is true.
COLOR_OBJECTS = [
    'blue block',
    'red block',
    'green block',
    'orange block',
    'purple block',
    'pink block',
    'white block',
    'yellow block',
    'brown block',
    'gray block',
    'cyan block',
    'brown box',
]

# _filter variant for the packing task
@torch.no_grad()
def _filter(
    self, image,
    detections, obj_name,
    location, frame_path=None,
    visualize_failure=False,
    use_color=True):
    """
    Keep the candidate boxes that CLIP classifies as obj_name.

    Every candidate box is cropped from the image and classified by CLIP
    against one "this is a photo of <object>" prompt per vocabulary entry.
    For each requested object, the boxes whose top-scoring class is that
    object are kept; if there are none, the single box with the highest
    probability for that class is used instead.

        image: H, W, 3
        detections: Candidate boxes [N, 4] (class agnostic)
        obj_name: string label of the object(s) of interest. A trailing "s"
            is dropped. "... X and Y <object>" is expanded into two queries
            (e.g. "pack all the red and blue blocks" -> "red block",
            "blue block"), and "... all the <object>" is reduced to
            "<object>".
        location: location hint; only the string 'none' is handled
        frame_path: optional frame path. When the fallback is used, the part
            of the file name before the first "_" is appended to
            self.bad_frames (once).
        visualize_failure: if set, self._visualize is called when the
            fallback is used, with all detections and with the chosen box
        use_color: use COLOR_OBJECTS as the vocabulary when true, otherwise
            ALL_CLIPORT_OBJECTS

    Returns the selected rows of detections for all requested objects,
    concatenated along dim 0.

    Raises AssertionError if a requested object is not in the vocabulary, or
    if location is anything other than 'none'.
    """
    # get image patches and run them through CLIP's preprocessing
    image_patches = self._crop_img_inside_bbox(image, detections)
    processed_patches = torch.stack([
        self.clip_preprocess(
            Image.fromarray(patch.numpy(), 'RGB')).to(self.device)
        for patch in image_patches
    ])

    vocabulary = COLOR_OBJECTS if use_color else ALL_CLIPORT_OBJECTS
    class_list = [f'this is a photo of {obj}' for obj in vocabulary]
    class_text = clip.tokenize(class_list).to(self.device)

    # run clip! One row per patch, one column per vocabulary entry.
    logits_per_image, _ = self.clip_model(processed_patches, class_text)
    probs = logits_per_image.softmax(dim=-1)
    preds = probs.argmax(-1)  # top-scoring vocabulary index of every patch

    # make the query singular; endswith() also copes with an empty string
    if obj_name.endswith('s'):
        obj_name = obj_name[:-1]

    obj_names = []
    if ' and ' in obj_name:
        # Split on the same delimiter that was tested above, so that names
        # merely containing "and" (e.g. "android") are not cut apart.
        obj_list = obj_name.split(' and ')
        # everything after the first word of the second part is the shared noun
        base_object = ' '.join(obj_list[1].split()[1:]).strip()
        # the last word in front of "and" is the first object's modifier
        obj0 = obj_list[0].split('all')[-1].strip().split()[-1]
        obj_names.append(obj0 + ' ' + base_object)
        obj_names.append(obj_list[1].strip())
    elif 'all the' in obj_name:
        obj_names.append(obj_name.split('all the')[-1].strip())
    else:
        obj_names.append(obj_name)

    all_pred_boxes = []
    for oname in obj_names:
        # Raised explicitly (not via `assert`) so the check survives
        # `python -O`; the AssertionError type is kept for existing callers.
        if oname not in vocabulary:
            raise AssertionError(f"not found {oname} in vocabulary")
        gt_class = vocabulary.index(oname)

        # keep the detections whose predicted class is the requested one
        pred_boxes = detections[preds == gt_class]

        if len(pred_boxes) == 0:
            # failure case: fall back to the box most confident for this class
            print(f"No confident box found for {oname}")
            pred_boxes = detections[probs[:, gt_class].argmax()].unsqueeze(0)

            if visualize_failure:
                self._visualize(image, detections=detections, concept='scene', caption='all_detections')
                self._visualize(image, pred_boxes, concept='filter', caption=f'filter_{oname}')

            if frame_path is not None:
                frame_num = frame_path.split('/')[-1].split('_')[0]
                if frame_num not in self.bad_frames:
                    self.bad_frames.append(frame_num)
        all_pred_boxes.append(pred_boxes)

    all_pred_boxes = torch.cat(all_pred_boxes, dim=0)

    if location != 'none':
        # location hints are not handled; fail loudly even under `python -O`
        raise AssertionError("not implemented yet")
    return all_pred_boxes

